Machine Learning Models for Geographic and Remote Work Analysis
KMeans Clustering, Salary Prediction, and Remote Work Classification
Authors
Affiliation
Bingrui Qiao
Zhengyu Zhou
Junhao Wang
Boston University
1 Classification: Remote vs Non-Remote Jobs
# Import necessary libraries
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Set plotly renderer
pio.renderers.default = "notebook+notebook_connected+vscode"

# Load the Lightcast job postings dataset with pandas.
# low_memory=False reads the whole file at once so dtype inference is
# consistent across all 131 columns.
df = pd.read_csv("data/lightcast_job_postings.csv", low_memory=False)

# Print schema and preview first few rows
print("--- Diagnostic check: Schema and sample rows ---")
print(df.info())
print(df.head())
--- Diagnostic check: Schema and sample rows ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72498 entries, 0 to 72497
Columns: 131 entries, ID to NAICS_2022_6_NAME
dtypes: float64(38), object(93)
memory usage: 72.5+ MB
None
ID LAST_UPDATED_DATE \
0 1f57d95acf4dc67ed2819eb12f049f6a5c11782c 9/6/2024
1 0cb072af26757b6c4ea9464472a50a443af681ac 8/2/2024
2 85318b12b3331fa490d32ad014379df01855c557 9/6/2024
3 1b5c3941e54a1889ef4f8ae55b401a550708a310 9/6/2024
4 cb5ca25f02bdf25c13edfede7931508bfd9e858f 6/19/2024
LAST_UPDATED_TIMESTAMP DUPLICATES POSTED EXPIRED DURATION \
0 2024-09-06 20:32:57.352 Z 0.0 6/2/2024 6/8/2024 6.0
1 2024-08-02 17:08:58.838 Z 0.0 6/2/2024 8/1/2024 NaN
2 2024-09-06 20:32:57.352 Z 1.0 6/2/2024 7/7/2024 35.0
3 2024-09-06 20:32:57.352 Z 1.0 6/2/2024 7/20/2024 48.0
4 2024-06-19 07:00:00.000 Z 0.0 6/2/2024 6/17/2024 15.0
SOURCE_TYPES SOURCES \
0 [\n "Company"\n] [\n "brassring.com"\n]
1 [\n "Job Board"\n] [\n "maine.gov"\n]
2 [\n "Job Board"\n] [\n "dejobs.org"\n]
3 [\n "Job Board"\n] [\n "disabledperson.com",\n "dejobs.org"\n]
4 [\n "FreeJobBoard"\n] [\n "craigslist.org"\n]
URL ... NAICS_2022_2 \
0 [\n "https://sjobs.brassring.com/TGnewUI/Sear... ... 44.0
1 [\n "https://joblink.maine.gov/jobs/1085740"\n] ... 56.0
2 [\n "https://dejobs.org/dallas-tx/data-analys... ... 52.0
3 [\n "https://www.disabledperson.com/jobs/5948... ... 52.0
4 [\n "https://modesto.craigslist.org/sls/77475... ... 99.0
NAICS_2022_2_NAME NAICS_2022_3 \
0 Retail Trade 441.0
1 Administrative and Support and Waste Managemen... 561.0
2 Finance and Insurance 524.0
3 Finance and Insurance 522.0
4 Unclassified Industry 999.0
NAICS_2022_3_NAME NAICS_2022_4 \
0 Motor Vehicle and Parts Dealers 4413.0
1 Administrative and Support Services 5613.0
2 Insurance Carriers and Related Activities 5242.0
3 Credit Intermediation and Related Activities 5221.0
4 Unclassified Industry 9999.0
NAICS_2022_4_NAME NAICS_2022_5 \
0 Automotive Parts, Accessories, and Tire Retailers 44133.0
1 Employment Services 56132.0
2 Agencies, Brokerages, and Other Insurance Rela... 52429.0
3 Depository Credit Intermediation 52211.0
4 Unclassified Industry 99999.0
NAICS_2022_5_NAME NAICS_2022_6 \
0 Automotive Parts and Accessories Retailers 441330.0
1 Temporary Help Services 561320.0
2 Other Insurance Related Activities 524291.0
3 Commercial Banking 522110.0
4 Unclassified Industry 999999.0
NAICS_2022_6_NAME
0 Automotive Parts and Accessories Retailers
1 Temporary Help Services
2 Claims Adjusting
3 Commercial Banking
4 Unclassified Industry
[5 rows x 131 columns]
from sklearn.model_selection import train_test_split

# Hold out 30% of the prepared data for testing, with a fixed seed so the
# partition is reproducible across runs.
# NOTE(review): df_prepared is built in an earlier (unseen) cell.
train_data, test_data = train_test_split(
    df_prepared,
    test_size=0.3,    # 30% for test
    random_state=42,  # seed=42
)
print(f"Training set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")
# Find optimal K by sweeping candidate cluster counts and scoring each with
# the silhouette coefficient (higher = better-separated clusters).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Extract the feature matrix from df_cluster: each entry of the "features"
# column is a vector, stacked into shape (n_samples, n_features).
X_cluster = np.stack(df_cluster["features"].to_numpy())

k_values = [3, 5, 7, 10, 15, 20]
silhouette_scores = []

print("--- Finding Optimal K ---")
for k in k_values:
    model = KMeans(
        n_clusters=k,
        random_state=42,  # seed = 42
        n_init="auto",
    )
    labels = model.fit_predict(X_cluster)
    score = silhouette_score(X_cluster, labels)
    silhouette_scores.append(score)
    print(f"K = {k}: Silhouette Score = {score:.4f}")
--- Finding Optimal K ---
K = 3: Silhouette Score = 0.5132
K = 5: Silhouette Score = 0.4411
K = 7: Silhouette Score = 0.3804
K = 10: Silhouette Score = 0.3590
K = 15: Silhouette Score = 0.3334
K = 20: Silhouette Score = 0.2926
7 Extract the feature matrix
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Extract the feature matrix
X_cluster = np.stack(df_cluster["features"].to_numpy())  # (n_samples, n_features)

# Train the final KMeans model at the K selected from the silhouette sweep
optimal_k = 3
kmeans_final = KMeans(
    n_clusters=optimal_k,
    random_state=42,  # seed = 42
    n_init="auto",
)
cluster_labels = kmeans_final.fit_predict(X_cluster)

# Write the cluster assignments back onto a copy of the DataFrame
df_clustered = df_cluster.copy()
df_clustered["cluster"] = cluster_labels

# Score the final clustering
final_score = silhouette_score(X_cluster, cluster_labels)
print(f"\n--- Final KMeans Model (K={optimal_k}) ---")
print(f"Silhouette Score: {final_score:.4f}")

# Cluster Distribution
print("\n--- Cluster Distribution ---")
print(df_clustered["cluster"].value_counts().sort_index())
--- Final KMeans Model (K=3) ---
Silhouette Score: 0.5132
--- Cluster Distribution ---
cluster
0 19473
1 39891
2 13134
Name: count, dtype: int64
8 Analyze clusters vs ONET reference labels
# Cross-tabulation of clusters and ONET occupation labels.
# BUG FIX: on pandas >= 2.0, value_counts().reset_index() already yields
# columns ["ONET_NAME", "count"], so the old
# rename(columns={"index": "ONET_NAME", "ONET_NAME": "count"}) produced two
# columns both named "count" (visible in the earlier output). Using
# rename_axis("ONET_NAME").reset_index(name="count") yields the intended
# ["ONET_NAME", "count"] columns on all pandas versions.
print("--- Top ONET categories in each cluster ---")
for cluster_id in range(optimal_k):
    print(f"\n=== Cluster {cluster_id} ===")
    top_onet_df = (
        df_clustered.loc[df_clustered["cluster"] == cluster_id, "ONET_NAME"]
        .value_counts()
        .head(5)
        .rename_axis("ONET_NAME")
        .reset_index(name="count")
    )
    print(top_onet_df)
--- Top ONET categories in each cluster ---
=== Cluster 0 ===
count count
0 Business Intelligence Analysts 19472
1 Unknown 1
=== Cluster 1 ===
count count
0 Business Intelligence Analysts 39851
1 Unknown 40
=== Cluster 2 ===
count count
0 Business Intelligence Analysts 13131
1 Unknown 3
9 Analyze cluster characteristics
print("--- Cluster Characteristics ---")

def _counts_and_pivot(col):
    """Return (long-format counts, cluster x category pivot) for one column
    of df_clustered, grouped by cluster."""
    long_tbl = (
        df_clustered
        .groupby(["cluster", col])
        .size()
        .reset_index(name="count")
        .sort_values(["cluster", col])
    )
    wide_tbl = (
        long_tbl
        .pivot(index="cluster", columns=col, values="count")
        .fillna(0)
        .astype(int)
    )
    return long_tbl, wide_tbl

# === Remote work distribution by cluster ===
print("\n=== Remote Work by Cluster ===")
remote_by_cluster, remote_pivot = _counts_and_pivot("REMOTE_GROUP")
print(remote_by_cluster)
print("\nRemote work pivot table:")
print(remote_pivot)

# === Experience level by cluster ===
print("\n=== Experience Level by Cluster ===")
exp_by_cluster, exp_pivot = _counts_and_pivot("MIN_YEARS_EXPERIENCE_GROUP")
print(exp_by_cluster)
print("\nExperience level pivot table:")
print(exp_pivot)

# === Top states by cluster ===
print("\n=== Top States by Cluster ===")
for cluster_id in range(optimal_k):
    print(f"\n--- Cluster {cluster_id} Top States ---")
    top_states = (
        df_clustered.loc[df_clustered["cluster"] == cluster_id, "STATE_NAME"]
        .value_counts()
        .head(5)
    )
    print(top_states)
--- Cluster Characteristics ---
=== Remote Work by Cluster ===
cluster REMOTE_GROUP count
0 0 Hybrid 540
1 0 Onsite 15203
2 0 Remote 3730
3 1 Hybrid 1177
4 1 Onsite 32518
5 1 Remote 6196
6 2 Hybrid 543
7 2 Onsite 10020
8 2 Remote 2571
Remote work pivot table:
REMOTE_GROUP Hybrid Onsite Remote
cluster
0 540 15203 3730
1 1177 32518 6196
2 543 10020 2571
=== Experience Level by Cluster ===
cluster MIN_YEARS_EXPERIENCE_GROUP count
0 0 Expert 922
1 0 Internship/Entry Level 7249
2 0 Junior 3628
3 0 Mid-Level 3894
4 0 Senior 3780
5 1 Expert 1847
6 1 Internship/Entry Level 14474
7 1 Junior 7179
8 1 Mid-Level 7681
9 1 Senior 8710
10 2 Expert 705
11 2 Internship/Entry Level 4953
12 2 Junior 2438
13 2 Mid-Level 2348
14 2 Senior 2690
Experience level pivot table:
MIN_YEARS_EXPERIENCE_GROUP Expert Internship/Entry Level Junior Mid-Level \
cluster
0 922 7249 3628 3894
1 1847 14474 7179 7681
2 705 4953 2438 2348
MIN_YEARS_EXPERIENCE_GROUP Senior
cluster
0 3780
1 8710
2 2690
=== Top States by Cluster ===
--- Cluster 0 Top States ---
STATE_NAME
Texas 2098
California 2041
Florida 1069
Virginia 1000
New York 801
Name: count, dtype: int64
--- Cluster 1 Top States ---
STATE_NAME
Texas 4519
California 3839
Illinois 2417
Virginia 2082
New York 1969
Name: count, dtype: int64
--- Cluster 2 Top States ---
STATE_NAME
Texas 1450
California 1204
Florida 636
New York 571
Virginia 554
Name: count, dtype: int64
10 Visualization: Elbow Plot
import pandas as pd
import plotly.express as px

# Tally how many jobs fell into each cluster
cluster_counts_series = df_clustered["cluster"].value_counts().sort_index()

# Build an explicit two-column frame: "cluster" and "count"
cluster_counts = pd.DataFrame({
    "cluster": cluster_counts_series.index,
    "count": cluster_counts_series.values,
})
print(cluster_counts)  # Check if the column names are ['cluster', 'count']

# Bar chart of cluster sizes.
# NOTE(review): the output filename says "elbow_plot" but this figure is a
# cluster-size distribution, not an elbow plot — confirm the intended name.
fig = px.bar(
    cluster_counts,
    x="cluster",
    y="count",
    title="KMeans Clustering: Distribution of Jobs Across Clusters",
    labels={"cluster": "Cluster", "count": "Number of Jobs"},
    template="plotly_white",
    color="count",
    color_continuous_scale="Blues",
)
fig.update_layout(font=dict(family="Roboto", size=14))
fig.write_image("figures/kmeans_elbow_plot.jpg")
fig
cluster count
0 0 19473
1 1 39891
2 2 13134
11 Visualization: Cluster Distribution
import pandas as pd
import plotly.express as px

# Visualization: Cluster Distribution
# FIX: removed a stray "(" that sat between the "# Get cluster counts"
# comment and the mapping assignment, which made this cell unparseable.
# Remap sklearn cluster ids — presumably to align with an earlier Spark
# model's labeling (TODO confirm against that cell).
mapping = {0: 2, 1: 0, 2: 1}
df_clustered["cluster_spark"] = df_clustered["cluster"].map(mapping)

cluster_counts = (
    df_clustered["cluster_spark"]
    .value_counts()
    .sort_index()
)

fig = px.bar(
    x=cluster_counts.index,
    y=cluster_counts.values,
    color=cluster_counts.values,
    color_continuous_scale="Blues",
    labels={"x": "Cluster", "y": "Number of Jobs", "color": "Number of Jobs"},
    title="KMeans Clustering: Distribution of Jobs Across Clusters",
    template="plotly_white",
)
fig.update_layout(font=dict(family="Roboto", size=14))
fig.write_image("figures/kmeans_cluster_distribution.jpg")
fig
12 Choropleth Map: Remote Work Percentage by State
import pandas as pd
import numpy as np

# Calculate remote work percentage by state (pandas version)
# 1. Count postings per (STATE_NAME, REMOTE_GROUP)
state_remote = (
    df_analysis
    .groupby(["STATE_NAME", "REMOTE_GROUP"])
    .size()
    .reset_index(name="count")
)

# 2. Pivot: index = STATE_NAME, columns = REMOTE_GROUP (Onsite / Remote / Hybrid)
state_df = (
    state_remote
    .pivot(index="STATE_NAME", columns="REMOTE_GROUP", values="count")
    .fillna(0)
    .reset_index()
)

# FIX: guard every expected REMOTE_GROUP column, not just "Hybrid" — the
# original raised KeyError below if "Onsite" or "Remote" happened to be
# absent from the data.
for _col in ("Onsite", "Remote", "Hybrid"):
    if _col not in state_df.columns:
        state_df[_col] = 0

# Calculate Total and Remote_Pct (share of postings that are fully remote)
state_df["Total"] = state_df["Onsite"] + state_df["Remote"] + state_df["Hybrid"]
state_df["Remote_Pct"] = (state_df["Remote"] / state_df["Total"] * 100).round(2)

print("--- State Remote Work Data ---")
print(state_df.head(10))
States with data: 50
REMOTE_GROUP STATE_NAME State_Abbrev Total Remote_Pct
0 Alabama AL 690.0 15.65
1 Alaska AK 236.0 41.10
2 Arizona AZ 1638.0 15.02
3 Arkansas AR 584.0 18.49
4 California CA 7084.0 14.91
5 Colorado CO 1455.0 17.80
6 Connecticut CT 863.0 18.77
7 Delaware DE 438.0 21.23
8 Florida FL 3645.0 14.07
9 Georgia GA 2658.0 15.99
14 Choropleth Map with State Labels showing Remote Percentage
import plotly.graph_objects as go

# Choropleth Map with State Labels showing Remote Percentage
abbrevs = state_df_clean['State_Abbrev']
remote_pct = state_df_clean['Remote_Pct']

# Hover text: "XX<br>NN.N%" per state
hover_text = abbrevs + '<br>' + remote_pct.astype(str) + '%'

choropleth = go.Choropleth(
    locations=abbrevs,
    z=remote_pct,
    locationmode='USA-states',
    colorscale='Blues',
    colorbar_title='Remote %',
    text=hover_text,
    hovertemplate='<b>%{text}</b><br>Total Jobs: %{customdata[0]}<br>Remote Jobs: %{customdata[1]}<extra></extra>',
    customdata=state_df_clean[['Total', 'Remote']].values,
    marker_line_color='white',
    marker_line_width=1,
)
fig = go.Figure(data=choropleth)

# Overlay rounded percentage labels centered on each state
fig.add_scattergeo(
    locations=abbrevs,
    locationmode='USA-states',
    text=remote_pct.apply(lambda v: f'{v:.0f}%'),
    mode='text',
    textfont=dict(size=8, color='black', family='Arial Black'),
    showlegend=False,
)

fig.update_layout(
    title_text='Remote Work Opportunity by State (% of Jobs that are Remote)',
    title_font_size=18,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
        bgcolor='rgba(0,0,0,0)',
    ),
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    height=600,
    width=1000,
)
fig.write_image("figures/choropleth_remote_work_with_labels.jpg")
print("Enhanced choropleth map saved!")
fig
Enhanced choropleth map saved!
15 Choropleth Map: Total Job Postings by State (with labels)
import plotly.graph_objects as go

# Choropleth Map: Total Job Postings by State (with labels)
# Get top 15 states by total jobs for labeling
top_states = state_df_clean.nlargest(15, 'Total')['State_Abbrev'].tolist()

fig = go.Figure(
    data=go.Choropleth(
        locations=state_df_clean['State_Abbrev'],
        z=state_df_clean['Total'],
        locationmode='USA-states',
        colorscale='Greens',
        colorbar_title='Total Jobs',
        hovertemplate='<b>%{location}</b><br>Total Jobs: %{z:,}<br>Remote: %{customdata[0]:,}<br>Onsite: %{customdata[1]:,}<extra></extra>',
        customdata=state_df_clean[['Remote', 'Onsite']].values,
        marker_line_color='white',
        marker_line_width=1.5,
    )
)

def _format_total(x):
    """Render large counts as 'X.XK', smaller ones as plain integers."""
    return f'{x/1000:.1f}K' if x >= 1000 else str(int(x))

# Label only the top-15 states by volume
top_state_df = state_df_clean[state_df_clean['State_Abbrev'].isin(top_states)].copy()
top_state_df['Total_Label'] = top_state_df['Total'].apply(_format_total)

fig.add_scattergeo(
    locations=top_state_df['State_Abbrev'],
    locationmode='USA-states',
    text=top_state_df['Total_Label'],
    mode='text',
    textfont=dict(size=10, color='darkgreen', family='Arial Black'),
    showlegend=False,
)

fig.update_layout(
    title_text='Total Job Postings by State<br><sup>Labels shown for top 15 states by job volume</sup>',
    title_font_size=16,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
    ),
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    height=600,
    width=1000,
)
fig.write_image("figures/choropleth_total_jobs_with_labels.jpg")
print("Enhanced total jobs choropleth map saved!")
fig
Enhanced total jobs choropleth map saved!
16 Bar Chart: Top 10 States by Remote Work Percentage
# Filter states with at least 100 jobs for meaningful comparison
eligible_states = state_df_clean[state_df_clean['Total'] >= 100]

# Keep the ten states with the highest remote-work share
top_remote_states = eligible_states.nlargest(10, 'Remote_Pct')

fig = px.bar(
    top_remote_states,
    x='STATE_NAME',
    y='Remote_Pct',
    color='Remote_Pct',
    color_continuous_scale='Blues',
    title='Top 10 States with Highest Remote Work Opportunities',
    labels={'STATE_NAME': 'State', 'Remote_Pct': 'Remote Work %'},
    text='Remote_Pct',
)
fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig.update_layout(
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    xaxis_tickangle=-45,
    showlegend=False,
)
fig.write_image("figures/top10_remote_states.jpg")
print("Top 10 remote states bar chart saved!")
fig